The dataset offers a comprehensive collection of TEDx talks from the TEDx YouTube channel, featuring talks aimed at inspiring, educating, and sparking discussions on various important subjects. Each entry includes details such as the video ID, publication time, title, description, tags, category ID, default audio language, duration, dimension, caption availability, licensed content status, view count, like count, favorite count, and comment count. The dataset offers insights into the content and engagement metrics of these TEDx talk videos, showcasing diverse topics and audience responses.
# Load the pre-processed TEDx YouTube dataset and print per-column summaries.
yt_df <- readRDS(file = "processed_youtube_df.rds")
yt_df |> summary()
## Utc_Day_Part Month Day_Of_Week Title
## Length:20268 Length:20268 Length:20268 Length:20268
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Description Tags Duration_Minutes Default_Audio_Language
## Length:20268 Length:20268 Min. : 1.00 Length:20268
## Class :character Class :character 1st Qu.:10.00 Class :character
## Mode :character Mode :character Median :13.00 Mode :character
## Mean :12.71
## 3rd Qu.:16.00
## Max. :30.00
## NA's :83
## Caption View_Count Like_Count Comment_Count
## Mode :logical Min. : 12 Min. : 0.0 Min. : 0.000
## FALSE:17179 1st Qu.: 217 1st Qu.: 6.0 1st Qu.: 0.000
## TRUE :3089 Median : 465 Median : 14.0 Median : 1.000
## Mean : 10067 Mean : 271.8 Mean : 9.652
## 3rd Qu.: 1124 3rd Qu.: 36.0 3rd Qu.: 5.000
## Max. :21296229 Max. :125879.0 Max. :2698.000
##
# Transposed preview of every column: name, type, and first few values.
glimpse(yt_df)
## Rows: 20,268
## Columns: 12
## $ Utc_Day_Part <chr> "Afternoon", "Afternoon", "Afternoon", "Afterno…
## $ Month <chr> "March", "March", "March", "March", "March", "M…
## $ Day_Of_Week <chr> "Tuesday", "Tuesday", "Tuesday", "Tuesday", "Tu…
## $ Title <chr> "The Great Diffusion | Alex Lazarow | TEDxSonom…
## $ Description <chr> "Over the last 150 years, unprecedented technol…
## $ Tags <chr> "Business,Economics,English,Entrepreneurship,Fu…
## $ Duration_Minutes <dbl> 10, 12, 11, 16, 16, 7, 6, 10, 12, 10, NA, 11, 1…
## $ Default_Audio_Language <chr> "en", "en", "en", "en", "en", "en", "pl", "pl",…
## $ Caption <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE,…
## $ View_Count <dbl> 77, 71, 313, 62, 180, 347, 119, 179, 90, 27419,…
## $ Like_Count <dbl> 3, 2, 13, 0, 10, 17, 4, 4, 3, 72, 1500, 18, 12,…
## $ Comment_Count <dbl> 0, 0, 4, 0, 0, 15, 1, 1, 0, 40, 49, 3, 0, 0, 5,…
# Interactive scatter of views (millions) vs likes (thousands), colored by
# day of week. Description is dropped so ggplotly doesn't carry long text.
# label1/label2/label3 are not real aesthetics: they are a ggplotly trick
# to surface raw values in the hover tooltip (selected via `tooltip =`).
data_check <- yt_df |> select(-Description)
p <- ggplot(data_check,
            aes(x = View_Count / 1e6,
                y = Like_Count / 1e3,
                color = factor(Day_Of_Week),
                label1 = View_Count,
                label2 = Like_Count,
                label3 = Day_Of_Week)) +
  geom_point(size = 3) +
  labs(x = 'Views (in millions)',  # fixed stray space before ')'
       y = 'Likes (in thousands)',
       title = 'Scatter plot for views and likes comparison',
       color = "Days") +
  theme_bw()
ggplotly(p, tooltip = c("label1", "label2", "label3"))
# Language-based bar and pie chart: top 10 default audio languages by talk count.
language_yt_df <- yt_df |>
  count(Default_Audio_Language, name = "n", sort = TRUE) |>
  slice_head(n = 10)
# Human-readable names for the ISO-style language codes.
language_lookup <- data.frame(
  Language_Code = c("ar", "en", "es", "fr", "hi",
                    "it", "pt", "ro", "tr", "zh-CN"),
  Language_Name = c("Arabic", "English", "Spanish", "French", "Hindi",
                    "Italian", "Portuguese", "Romanian", "Turkish", "Chinese (PRC)")
)
# merge() re-sorts the result alphabetically by language code; the legend
# labels built further down rely on that ordering, so keep merge() here.
language_yt_df <- merge(language_yt_df, language_lookup,
                        by.x = "Default_Audio_Language",
                        by.y = "Language_Code",
                        all.x = TRUE)
# Bar chart of counts per language code; the fill legend maps each code to
# its full language name.
# Name the label vector by language code so the code -> name pairing is
# correct regardless of the row order of language_yt_df.
legend_labels <- setNames(
  paste(language_yt_df$Default_Audio_Language, ":", language_yt_df$Language_Name),
  language_yt_df$Default_Audio_Language
)
ggplot(language_yt_df) +
  # geom_col() is the idiomatic replacement for geom_bar(stat = 'identity')
  geom_col(aes(x = Default_Audio_Language, y = n, fill = Default_Audio_Language)) +
  scale_y_continuous(labels = scales::comma) +
  labs(x = 'Language', y = 'Frequency', title = 'Top 10 languages') +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(hjust = 1),
        legend.title = element_blank()) +
  scale_fill_discrete(name = "Language", labels = legend_labels)
# Share of uploads per day of the week: counts, fractions, and percent labels.
days_df <- yt_df |>
  group_by(Day_Of_Week) |>
  summarise(n = n()) |>
  mutate(percentage = round(n / sum(n), 2)) |>
  mutate(label_perc = scales::percent(percentage))
# Static ggplot donut kept for reference:
# ggplot(days_df, aes(x = "", y = percentage, fill = Day_Of_Week)) +
#   geom_col(color = "black") +
#   geom_label(aes(label = label_perc),
#              position = position_stack(vjust = 0.5),
#              show.legend = FALSE) +
#   guides(fill = guide_legend(title = "Exploring Daily Data")) +
#   coord_polar(theta = "y") +
#   theme_void()
# Interactive donut chart. add_pie() already produces a pie trace, so the
# redundant `type = 'pie'` and the extra positional `days_df` argument
# (plot_ly(days_df) already supplies the data) are dropped.
plotly::plot_ly(days_df) %>%
  add_pie(labels = ~factor(Day_Of_Week), values = ~n,
          textinfo = "label+percent", hole = 0.3) %>%
  layout(title = "Exploring Daily Data")
# Top 10 most-viewed talks. Refer to the piped column, not yt_df$View_Count:
# indexing the original frame inside a pipe silently breaks if earlier steps
# ever filter or reorder rows.
mvideo <- yt_df |>
  select(-Description) |>
  arrange(desc(View_Count))
top_mvideo <- mvideo |> slice_head(n = 10)
# Titles of most viewed videos. The misplaced `stat = 'identity'` argument to
# ggplot() is removed (it was ignored; geom_point's default stat is identity).
ggplot(top_mvideo, aes(y = reorder(Title, View_Count), x = View_Count / 1e6)) +
  geom_point() +
  labs(y = 'Video Titles', x = 'Views (in Millions)',
       title = 'Top 10 Most viewed videos') +
  theme(plot.title = element_text(hjust = 0.5),  # hjust must be in [0, 1]; 5 pushed the title off-plot
        axis.text.x = element_text(hjust = 1))
# Time duration of most viewed videos
ggplot(top_mvideo, aes(y = reorder(Title, Duration_Minutes), x = Duration_Minutes)) +
  geom_point() +
  labs(x = 'Video length (in mins)', y = 'Video Titles',
       title = 'Time duration for most viewed videos') +
  theme(plot.title = element_text(hjust = 0.5),  # was 2; valid range is [0, 1]
        axis.text.x = element_text(hjust = 1))
# Engagement metrics: likes + comments as a percentage of views, plus a
# numeric weekday index (Monday = 1 ... Sunday = 7) for ordering.
weekday_levels <- c("Monday", "Tuesday", "Wednesday", "Thursday",
                    "Friday", "Saturday", "Sunday")
emetrics_df <- yt_df |>
  # dropped the stray trailing comma that was inside select()
  select(Like_Count, Comment_Count, View_Count, Duration_Minutes,
         Utc_Day_Part, Day_Of_Week) |>
  mutate(Engagement_Rate = ((Like_Count + Comment_Count) / View_Count) * 100) |>
  # match() replaces the 7-branch case_when; unmatched values become NA in both
  mutate(week_index = match(Day_Of_Week, weekday_levels))
# Engagement rate vs video duration, colored by time of day, made interactive.
gg <- ggplot(emetrics_df,
             aes(x = Duration_Minutes, y = Engagement_Rate, color = Utc_Day_Part)) +
  geom_line() +
  geom_point() +
  scale_color_viridis(discrete = TRUE) +
  ggtitle("Engagement Rate by Video Duration") +
  guides(color = guide_legend(title = "Times of (the) day")) +
  # was labs("Engagement_Rate"): an unnamed argument that never became a
  # label; name it so it actually sets the y-axis title
  labs(y = "Engagement_Rate")
ggplotly(gg)
# Interactive line plot: engagement rate across weekdays, one line per
# distinct duration. label1-3 are dummy aesthetics that only feed the
# plotly hover tooltip selected below.
emd <- ggplot(
  emetrics_df,
  aes(x = Day_Of_Week,
      y = Engagement_Rate,
      group = Duration_Minutes,
      color = Duration_Minutes,
      label1 = Day_Of_Week,
      label2 = Engagement_Rate,
      label3 = Duration_Minutes)
) +
  geom_line() +
  geom_point() +
  scale_color_viridis(discrete = FALSE) +
  labs(x = "Days (of week)",
       y = "Rate of Engagement (in %)") +
  ggtitle("Daily Engagement Rate")
ggplotly(emd, tooltip = c("label1", "label2", "label3"))
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(dplyr)
library(tidyverse)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(skimr)
library(recipes)
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
library(kableExtra)
library(DALEXtra)
## Loading required package: DALEX
## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
##
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
##
## explain
# Pre-processed feature table used for the ML explainability section below.
yt_ml_df <- readRDS("processed_youtube_ml.rds")
Predictors go in `x_train_tbl_dl`; the outcome goes in `y_train_tbl_dl`.
# Deep-learning split: Utc_Day_Part is the outcome, all other columns predictors.
x_train_tbl_dl <- yt_ml_df |> select(!Utc_Day_Part)
y_train_tbl_dl <- yt_ml_df |> select(Utc_Day_Part)
Predictors go in `x_train_tbl_gbm`; the outcome goes in `y_train_tbl_gbm`.
# GBM split: Duration_Minutes is the outcome, all other columns predictors.
x_train_tbl_gbm <- yt_ml_df |> select(!Duration_Minutes)
y_train_tbl_gbm <- yt_ml_df |> select(Duration_Minutes)
# Start (or attach to) a local H2O cluster on the default port (54321).
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\ADMINI~1\AppData\Local\Temp\RtmpauFzYK\file432c7760531a/h2o_Administrator_started_from_r.out
## C:\Users\ADMINI~1\AppData\Local\Temp\RtmpauFzYK\file432c64952d7/h2o_Administrator_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 5 seconds 540 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.44.0.3
## H2O cluster version age: 4 months and 3 days
## H2O cluster name: H2O_started_from_R_Administrator_pyb003
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.50 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.3 (2024-02-29 ucrt)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (4 months and 3 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Reload the previously trained models from disk.
# NOTE(review): judging by file names, the DL model targets Utc_Day_Part and
# the GBM targets Duration_Minutes — confirm against the training script.
saved_model_dl <- h2o.loadModel("4-dl-model-day-part.h2o")
saved_model_gbm <- h2o.loadModel("4-gbm-model-duration.h2o")
# A single hand-crafted observation used to probe both saved models.
x_test_data <- data.frame(
  Utc_Day_Part           = "Evening",   # upload day-part label
  Month                  = "February",
  Day_Of_Week            = "Friday",
  Duration_Minutes       = 14,          # video length in minutes
  Default_Audio_Language = "en",
  Caption                = FALSE,       # no captions available
  View_Count             = 24904,
  Like_Count             = 658,
  Comment_Count          = 50
)
# Partition the skim() summary by column class so we can pull out the
# character columns that need converting to factors.
# Was assigned with `=`; use `<-` for top-level assignment.
new_observation_tbl_skim <- partition(skim(x_test_data))
names(new_observation_tbl_skim)
## [1] "character" "logical" "numeric"
string_2_factor_names_new_observation <- new_observation_tbl_skim$character$skim_variable
# Recipe mirroring training-time preprocessing for the single new observation:
# strings -> factors, then median/mode imputation for any missing values.
rec_obj_new_observation <- recipe(~ ., data = x_test_data) |>
  step_string2factor(all_of(string_2_factor_names_new_observation)) |>
  step_impute_median(all_numeric()) |> # missing values in numeric columns
  step_impute_mode(all_nominal()) |>   # missing values in factor columns
  prep()
new_observation_processed_tbl <- bake(rec_obj_new_observation, x_test_data)
# Was assigned with `=`; use `<-` for top-level assignment.
new_application <- new_observation_processed_tbl
# For Deep Learning ### XAI (Explainable AI)
# NOTE(review): Utc_Day_Part holds character day-part labels, so
# `Utc_Day_Part == 1` is always FALSE — confirm the intended positive class
# (e.g. a specific day-part label) with the model author.
h2o_exp_dl <- explain_h2o(
  saved_model_dl, data = x_train_tbl_dl,
  y = y_train_tbl_dl$Utc_Day_Part == 1,
  label = "H2O", type = "classification")
# Partial-dependence profile for View_Count.
h2o_exp_dl_pdp <- model_profile(
  explainer = h2o_exp_dl, variables = "View_Count")
plot(h2o_exp_dl_pdp, geom = "profiles") +
  ggtitle("View_Count")
# Ceteris-paribus profiles around the hand-crafted observation.
h2o_exp_dl_cp <- predict_profile(
  explainer = h2o_exp_dl, new_observation = new_application)
plot(h2o_exp_dl_cp, variables = c("View_Count", "Duration_Minutes")) +
  ggtitle("View_Count")
# SHAP attributions for the same observation (B = 5 orderings).
h2o_exp_dl_shap <- predict_parts(
  explainer = h2o_exp_dl, new_observation = new_application,
  type = "shap", B = 5)
plot(h2o_exp_dl_shap) + ggtitle("SHAP explanation")  # fixed "explaination" typo
# For Gradient Boosting Machines ### XAI (Explainable AI)
# NOTE(review): Duration_Minutes is numeric (a regression-style target), yet
# the explainer uses type = "classification" with y = Duration_Minutes == 1
# (TRUE only for 1-minute videos) — confirm this is intentional.
h2o_exp_gbm <- explain_h2o(
  saved_model_gbm, data = x_train_tbl_gbm,
  y = y_train_tbl_gbm$Duration_Minutes == 1,
  label = "H2O", type = "classification")
# Partial-dependence profile for View_Count.
h2o_exp_gbm_pdp <- model_profile(
  explainer = h2o_exp_gbm, variables = "View_Count")
plot(h2o_exp_gbm_pdp, geom = "profiles") +
  ggtitle("View_Count")
# Ceteris-paribus profiles; Day_Of_Week is non-numeric, so predict_profile
# rejects it (see the message captured in the rendered output below).
h2o_exp_gbm_cp <- predict_profile(
  explainer = h2o_exp_gbm, new_observation = new_application)
plot(h2o_exp_gbm_cp, variables = c("View_Count", "Day_Of_Week")) +
  ggtitle("View_Count")
## Non-numerical variables (from the 'variables' argument) are rejected.
# SHAP attributions for the new observation (B = 5 orderings).
h2o_exp_gbm_shap <- predict_parts(
  explainer = h2o_exp_gbm, new_observation = new_application,
  type = "shap", B = 5)
plot(h2o_exp_gbm_shap) + ggtitle("SHAP explanation")  # fixed "explaination" typo